The Thera bank recently saw a steep decline in the number of users of their credit card. Credit cards are a good source of income for banks because of the different kinds of fees they charge, such as annual fees, balance transfer fees, cash advance fees, late payment fees, and foreign transaction fees, among others. Some fees are charged to every user irrespective of usage, while others are charged only under specified circumstances.
Customers leaving the credit card service would lead the bank to a loss, so the bank wants to analyze its customer data to identify the customers who will leave the credit card service, and the reasons why — so that the bank can improve in those areas.
As a Data Scientist at Thera bank, you need to come up with a classification model that will help the bank improve its services so that customers do not renounce their credit cards.
You need to identify the best possible model that will give the required performance
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.ensemble import (
AdaBoostClassifier,
GradientBoostingClassifier,
RandomForestClassifier,
BaggingClassifier
)
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import (
f1_score,
accuracy_score,
precision_score,
recall_score,
confusion_matrix,
roc_auc_score,
plot_confusion_matrix
)
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import RandomOverSampler
# pandas display options: show every column, format floats to 3 decimals
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# mount Google Drive (Colab-only) and load the raw churn data
from google.colab import drive
drive.mount('/content/drive')
churn = pd.read_csv('/content/drive/MyDrive/Python Course/BankChurners.csv')
# work on a copy so the raw 'churn' frame stays untouched for the final pipeline
data = churn.copy()
data.head()
data.shape
data.info()
# cast the nominal columns to pandas 'category' dtype
data['Gender'] = data['Gender'].astype('category')
data['Education_Level'] = data['Education_Level'].astype('category')
data['Marital_Status'] = data['Marital_Status'].astype('category')
data['Income_Category'] = data['Income_Category'].astype('category')
data['Card_Category'] = data['Card_Category'].astype('category')
data.isnull().sum()
data.duplicated().sum()
data.describe().T
# CLIENTNUM is a unique row identifier with no predictive value -> drop it
data['CLIENTNUM'].nunique()
data.drop('CLIENTNUM', axis=1, inplace=True)
data.head(2)
# create a list for categorical columns
cat_cols = ['Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category']
# inspect the level counts of each categorical column
for column in cat_cols:
    print(data[column].value_counts())
    print('-'*50)
# Replacing the 'abc' placeholder level with 'Less than $40K'
data['Income_Category'] = data['Income_Category'].replace('abc', 'Less than $40K')
data['Income_Category'].value_counts()
# mode-impute the two columns that may contain missing values
# NOTE(review): assumes Education_Level / Marital_Status hold NaNs —
# confirm against the isnull() output above
imputer = SimpleImputer(strategy='most_frequent')
impute_mode_cols = ['Education_Level', 'Marital_Status']
data[impute_mode_cols] = imputer.fit_transform(data[impute_mode_cols])
data.isnull().sum()
# function to plot a boxplot and histogram stacked on a shared x-axis
def hist_box(data, feature, figsize=(12, 7), kde=False, bins=None):
    """
    Plot a boxplot above a histogram of a numeric feature, sharing the x-axis.

    data: dataframe
    feature: dataframe column name (numeric)
    figsize: size of figure
    kde: whether to show the density curve (default False)
    bins: number of bins for the histogram (default None -> auto)
    """
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,
        sharex=True,
        gridspec_kw={'height_ratios': (0.25, 0.75)},
        figsize=figsize
    )
    # boxplot on top; showmeans marks the mean alongside the median
    sns.boxplot(
        data=data, x=feature, ax=ax_box2, showmeans=True, color='orange'
    )
    # Fixed: the original evaluated a conditional expression purely for its
    # side effects; use a plain if/else. Also dropped the 'palette' argument,
    # which seaborn ignores (with a warning) when no 'hue' is given.
    if bins:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2, bins=bins)
    else:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2)
    # reference lines: dashed green = mean, solid black = median
    ax_hist2.axvline(data[feature].mean(), color='green', linestyle='--')
    ax_hist2.axvline(data[feature].median(), color='black', linestyle='-')
# univariate distributions: boxplot + histogram for each numeric feature
hist_box(data, 'Customer_Age')
hist_box(data, 'Dependent_count')
hist_box(data, 'Months_on_book')
Months on book has a slight left skew but is still a fairly normal distribution. It peaks sharply, with a large majority of customers being on the books for 36-38 months, or just over 3 years.
There are a few outliers at both ends of the spectrum, but they are valid data points and are not cause for any outlier treatment.
# distributions of the remaining numeric features
hist_box(data, 'Total_Relationship_Count')
hist_box(data, 'Months_Inactive_12_mon')
hist_box(data, 'Contacts_Count_12_mon')
hist_box(data, 'Credit_Limit')
hist_box(data, 'Avg_Open_To_Buy')
hist_box(data, 'Total_Revolving_Bal')
hist_box(data, 'Total_Trans_Amt')
hist_box(data, 'Total_Trans_Ct')
hist_box(data, 'Avg_Utilization_Ratio')
hist_box(data, 'Total_Amt_Chng_Q4_Q1')
hist_box(data, 'Total_Ct_Chng_Q4_Q1')
# function to create labeled barplots
def barplot_labeled(data, feature, perc=False, n=None):
    """
    Barplot of a categorical feature with a count or percentage label per bar.

    data: dataframe
    feature: dataframe column
    perc: whether to display percentages instead of counts (default False)
    n: displays top n category levels (default None -> display all)
    """
    total = len(data[feature])          # number of rows, for percentage labels
    count = data[feature].nunique()     # number of category levels
    # widen the figure with the number of bars shown
    if n is None:
        plt.figure(figsize=(count + 1, 5))
    else:
        plt.figure(figsize=(n + 1, 5))
    plt.xticks(rotation=90, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        # keep only the n most frequent levels, then sort for a stable order
        order=data[feature].value_counts().index[:n].sort_values(),
    )
    for p in ax.patches:
        if perc:  # idiomatic truth test instead of '== True'
            label = "{:.1f}%".format(100 * p.get_height() / total)
        else:
            # format as an integer so the label never renders as e.g. '456.0'
            label = "{:.0f}".format(p.get_height())
        x = p.get_x() + p.get_width() / 2
        y = p.get_height()
        ax.annotate(
            label,
            (x, y),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),
            textcoords="offset points",
        )
    plt.show()
# labeled barplots for the categorical features and the target
barplot_labeled(data, 'Gender')
barplot_labeled(data, 'Education_Level')
barplot_labeled(data, 'Marital_Status')
barplot_labeled(data, 'Income_Category')
barplot_labeled(data, 'Card_Category')
barplot_labeled(data, 'Attrition_Flag')
Let's first take a look at the relationship between our response variable and the numeric variables.
# churn vs non-churn counts across the discrete numeric features
plt.figure(figsize=(20, 10))
sns.countplot(data=data, x='Customer_Age', hue='Attrition_Flag')
plt.show()
plt.figure(figsize=(15, 8))
sns.countplot(data=data, x='Dependent_count', hue='Attrition_Flag')
plt.show()
plt.figure(figsize=(20, 10))
sns.countplot(data=data, x='Months_on_book', hue='Attrition_Flag')
plt.show()
plt.figure(figsize=(15, 8))
sns.countplot(data=data, x='Total_Relationship_Count', hue='Attrition_Flag')
plt.show()
plt.figure(figsize=(15, 8))
sns.countplot(data=data, x='Months_Inactive_12_mon', hue='Attrition_Flag')
plt.show()
plt.figure(figsize=(15, 8))
sns.countplot(data=data, x='Contacts_Count_12_mon', hue='Attrition_Flag')
plt.show()
# continuous features split by churn status
plt.figure(figsize=(12,7))
sns.boxplot(data=data, y='Credit_Limit', x='Attrition_Flag')
plt.show()
plt.figure(figsize=(12,7))
sns.boxplot(data=data, y='Total_Revolving_Bal', x='Attrition_Flag')
plt.show()
plt.figure(figsize=(12,7))
sns.boxplot(data=data, y='Avg_Open_To_Buy', x='Attrition_Flag')
plt.show()
plt.figure(figsize=(12,7))
sns.boxplot(data=data, y='Total_Amt_Chng_Q4_Q1', x='Attrition_Flag')
plt.show()
plt.figure(figsize=(12,7))
sns.boxplot(data=data, y='Total_Ct_Chng_Q4_Q1', x='Attrition_Flag')
plt.show()
plt.figure(figsize=(12,7))
sns.boxplot(data=data, y='Total_Trans_Amt', x='Attrition_Flag')
plt.show()
plt.figure(figsize=(12,7))
sns.boxplot(data=data, y='Total_Trans_Ct', x='Attrition_Flag')
plt.show()
plt.figure(figsize=(12,7))
sns.boxplot(data=data, y='Avg_Utilization_Ratio', x='Attrition_Flag')
plt.show()
# churn percentage within each level of the categorical features
# (normalize='index' makes each bar sum to 100%)
(pd.crosstab(data['Gender'], data['Attrition_Flag'], normalize='index') * 100).plot(
    kind='bar', figsize=(10, 6), stacked=True, color=['orange', 'blue']
)
plt.ylabel('Percent Response');
(pd.crosstab(data['Education_Level'], data['Attrition_Flag'], normalize='index') * 100).plot(
    kind='bar', figsize=(10, 6), stacked=True, color=['orange', 'blue']
)
plt.ylabel('Percent Response');
(pd.crosstab(data['Marital_Status'], data['Attrition_Flag'], normalize='index') * 100).plot(
    kind='bar', figsize=(10, 6), stacked=True, color=['orange', 'blue']
)
plt.ylabel('Percent Response');
(pd.crosstab(data['Income_Category'], data['Attrition_Flag'], normalize='index') * 100).plot(
    kind='bar', figsize=(10, 6), stacked=True, color=['orange', 'blue']
)
plt.ylabel('Percent Response');
(pd.crosstab(data['Card_Category'], data['Attrition_Flag'], normalize='index') * 100).plot(
    kind='bar', figsize=(10, 6), stacked=True, color=['orange', 'blue']
)
plt.ylabel('Percent Response')
# correlation heatmap of the numeric features
plt.figure(figsize=(24, 14))
sns.heatmap(data.corr(), annot=True, vmax=1, vmin=-1, cmap='Spectral')
plt.show()
# pairwise scatter plots coloured by churn status
sns.pairplot(data, hue='Attrition_Flag')
plt.show()
Let's start by dropping some columns in order to help avoid multicollinearity.
# drop highly correlated columns (seen in the heatmap above)
data.drop(['Avg_Open_To_Buy'], axis=1, inplace=True)
data.drop(['Total_Trans_Ct'], axis=1, inplace=True)
# encode target: Attrited Customer -> 1 (positive class), Existing -> 0
data['Attrition_Flag'] = data['Attrition_Flag'].replace('Attrited Customer', 1)
data['Attrition_Flag'] = data['Attrition_Flag'].replace('Existing Customer', 0)
data.head()
data['Attrition_Flag'].value_counts()
X = data.drop(columns='Attrition_Flag')
# one-hot encode the categorical predictors
X = pd.get_dummies(X)
Y = data['Attrition_Flag']
# We will split the data into training, validation, and test sets
# 1st we split the data into temp and test sets (80/20, stratified on Y)
X_temp, X_test, Y_temp, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1, stratify=Y)
# then we split temp into training and validation sets (75/25 of temp)
X_train, X_val, Y_train, Y_val = train_test_split(X_temp, Y_temp, test_size =0.25, random_state=1, stratify=Y_temp)
print(X_train.shape, X_val.shape, X_test.shape)
We know that our target variable 'Attrition_Flag' has a high class imbalance. To try to account for this in our models, we will try out four class-balancing strategies.
# function implementing our class balancing strategies
# choice selects one of 5 strategies (0-4)
def data_balancing(X_train, Y_train, choice):
    """
    Resample a training set with one of five class-balancing strategies.

    choice: 0 = SMOTE oversampling, 1 = random undersampling,
            2 = NearMiss undersampling, 3 = random oversampling,
            4 = no resampling (data returned unchanged)
    Returns the (possibly resampled) X, Y pair.
    Raises ValueError for any other choice.
    """
    if choice == 0:
        sampler = SMOTE(sampling_strategy=1, k_neighbors=5, random_state=1)
    elif choice == 1:
        sampler = RandomUnderSampler(random_state=1)
    elif choice == 2:
        sampler = NearMiss(version=1)
    elif choice == 3:
        sampler = RandomOverSampler(random_state=1)
    elif choice == 4:
        # strategy 4: leave the training data unbalanced
        return X_train, Y_train
    else:
        # previously an unknown choice fell through to an undefined sampler
        # and raised a confusing NameError; fail fast with a clear message
        raise ValueError('choice must be an integer in 0-4, got {}'.format(choice))
    # resample the training data with the selected strategy
    X_train_sample, Y_train_sample = sampler.fit_resample(X_train, Y_train)
    return X_train_sample, Y_train_sample
def create_model(choice):
    """
    Run 5-fold stratified CV for six tree-based classifiers, applying the
    selected class-balancing strategy to the training folds only.

    choice: choice of class balancing strategy
            from our data_balancing function
    Returns (results, names): per-model lists of fold recall scores and
    the corresponding model names, in the same order.
    """
    models = []  # empty list for storing our models
    # We will append our models to the list
    models.append(('Bagging', BaggingClassifier(random_state=1)))
    models.append(('Random Forest', RandomForestClassifier(random_state=1)))
    models.append(('Gradient Boost', GradientBoostingClassifier(random_state=1)))
    models.append(('AdaBoost', AdaBoostClassifier(random_state=1)))
    models.append(('XGBoost', XGBClassifier(random_state=1, eval_metric='logloss')))
    models.append(('DTree', DecisionTreeClassifier(random_state=1)))
    results = []  # empty list for model CV scores
    names = []  # empty list for model names
    # loop to get the mean CV score for each model
    print('\n' ' CV Performance:' '\n')
    for name, model in models:
        scoring = 'recall'  # scoring for K fold CV (recall favours catching churners)
        # Stratified K Fold CV
        kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
        cv_result=[]  # empty list for CV results
        # loop through training and validation folds; resampling is applied
        # to the k-1 training folds only, never to the held-out fold
        # NOTE(review): X_temp / Y_temp are module-level globals from the
        # earlier train/test split
        for train_ix, validate_ix in kfold.split(X_temp, Y_temp):
            # Separate training and validation data
            train_X, val_X = X_temp.iloc[train_ix], X_temp.iloc[validate_ix]
            train_Y, val_Y, = Y_temp.iloc[train_ix], Y_temp.iloc[validate_ix]
            # train_X and train_Y are data from k-1 folds
            X_train_un, y_train_un, = data_balancing(train_X, train_Y, choice)
            model.fit(X_train_un, y_train_un)
            Y_pred = model.predict(val_X)  # predict the left-out fold
            recall_val = recall_score(val_Y, Y_pred)
            cv_result.append(recall_val)
        results.append(cv_result)
        names.append(name)
        print('Model {}: Class Balancing + Stratified CV gives avg. Recall {}'.format(
            name, round(np.mean(cv_result)*100, 2)
        ))
    return results, names
# run the CV comparison under each balancing strategy
results_SMOTE, names_SMOTE=create_model(0)
results_Rus, names_Rus=create_model(1)
results_NM, names_NM=create_model(2)
results_Ros, names_Ros=create_model(3)
results_Rus
# XGBoost recall values for the 5 validation folds
# (index 4 = XGBoost, per the append order in create_model)
results_Rus[4]
# setting XGB to = Random Under Sampling results and checking XGB median
XGB=results_Rus[4]
print(np.median(XGB))
# Boxplots of the CV scores for all models
fig = plt.figure(figsize=(15,8))
fig.suptitle('Comparison of Random Under Sampling')
ax = fig.add_subplot(111)
plt.boxplot(results_Rus)
ax.set_xticklabels(names_Rus)
plt.show()
# define function for metrics to check performance
def model_perf_class_sklearn(model, predictors, target):
    """
    Compute classification metrics for a fitted model.

    model: fitted classifier (must implement predict)
    predictors: independent variables
    target: true labels for the dependent variable
    Returns a one-row dataframe with Accuracy, Recall, Precision and F1.
    """
    # predictions on the given data
    pred = model.predict(predictors)
    acc = accuracy_score(target, pred)
    recall = recall_score(target, pred)
    precision = precision_score(target, pred)
    f1 = f1_score(target, pred)
    # single-row dataframe of the metrics
    # (fixed: column name was misspelled 'Precisiion')
    df_perf = pd.DataFrame(
        {
            'Accuracy': acc,
            'Recall': recall,
            'Precision': precision,
            'F1': f1
        },
        index=[0]
    )
    return df_perf
# create function for confusion matrix plotting
def cm_sklearn(model, predictors, target):
    """
    Plot a 2x2 confusion matrix annotated with counts and percentages.

    model: fitted classifier (must implement predict)
    predictors: independent variables
    target: true labels for the dependent variable
    """
    y_pred = model.predict(predictors)
    cm = confusion_matrix(target, y_pred)
    # hoist the flattened matrix and its total out of the loop instead of
    # recomputing cm.flatten().sum() once per cell
    flat = cm.flatten()
    total = flat.sum()
    labels = np.asarray(
        ['{0:0.0f}'.format(item) + '\n{0:.2%}'.format(item / total) for item in flat]
    ).reshape(2, 2)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=labels, fmt='')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# define model
model = XGBClassifier(random_state=1, eval_metric='logloss')
X_train_b, Y_train_b = data_balancing(X_temp, Y_temp, 1) # choice 1 is Random Under Sampling
# Parameter grid to pass in GridSearchCV
param_grid = {'n_estimators': np.arange(50, 100, 50),
              'learning_rate': [0.01, 0.1],
              'gamma': [0,1],
              }
# Type of scoring used
scorer = metrics.make_scorer(metrics.recall_score)
# Run grid search
# NOTE(review): the data is undersampled BEFORE the CV split here, so the
# internal validation folds are resampled too; the pipeline-based search
# below resamples inside the CV loop instead
grid_obj = GridSearchCV(model, param_grid, scoring=scorer, n_jobs=-1, cv=2)
grid_obj = grid_obj.fit(X_train_b, Y_train_b) # train data
# Set clf to the best combination of parameters
xgb_estimator = grid_obj.best_estimator_
# Fit best algorithm to the data
xgb_estimator.fit(X_train_b, Y_train_b)
%%time
from imblearn.pipeline import Pipeline
# define model
model = XGBClassifier(random_state=1, eval_metic='logloss')
# define class balancing
sm = RandomUnderSampler()
# pipeline
pipeline = Pipeline([('sampling', sm), ('class', model)])
# Parameter grid to pass in GridSearchCV
param_grid={'class__n_estimators': np.arange(50, 150, 50),
'class__learning_rate': [0.001, 0.01, 0.1],
'class__gamma': [0,1],
'class__subsample': [0.8, 0.9, 1],
'class__reg_lambda':[5,10]
}
# scoring
scorer = metrics.make_scorer(metrics.recall_score)
# call GridSearchCV
grid_cv = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring=scorer, cv=5, n_jobs=-1, verbose=2)
# Fit parameters
grid_cv.fit(X_temp, Y_temp)
print('Best parameters are {} with CV score={}:'.format(grid_cv.best_params_, grid_cv.best_score_))
# building model with best parameters
X_train_over, Y_train_over = sm.fit_resample(X_temp, Y_temp)
xgb_tuned1 = XGBClassifier(
random_state=1,
n_estimators=100,
subsample=0.8,
learning_rate=0.1,
gamma=0,
eval_metric='logloss',
reg_lambda=5
)
# Fit the model on training data
xgb_tuned1.fit(X_train_over, Y_train_over)
# Calculating metrics on validation set
xgboost_grid_val = model_perf_class_sklearn(xgb_tuned1, X_test, Y_test)
print('Test Performance:')
xgboost_grid_val
cm_sklearn(xgb_tuned1, X_test, Y_test)
from sklearn.model_selection import RandomizedSearchCV
%%time
# define model
model = XGBClassifier(random_state=1, eval_metric='logloss')
# parameter grid to pass in RandomizedSearchCV ('class__' targets the
# classifier step of the imblearn pipeline)
param_grid={'class__n_estimators': np.arange(50, 150, 50),
            'class__learning_rate': [0.01, 0.1],
            'class__gamma': [0,1],
            'class__subsample': [0.8, 0.9, 1],
            'class__reg_lambda': [5, 10]}
# Type of scoring
scorer = metrics.make_scorer(metrics.recall_score)
sm = RandomUnderSampler()
pipeline = Pipeline([('sampling', sm), ('class', model)])
# Call RandomizedSearchCV
xgb_tuned2 = RandomizedSearchCV(estimator=pipeline, param_distributions=param_grid, n_iter=10, scoring=scorer, cv=5, random_state=1, n_jobs= -1)
# Fit parameters in RandomizedSearchCV
xgb_tuned2.fit(X_temp, Y_temp)
print('Best parameters are {} with CV score {}:'.format(xgb_tuned2.best_params_, xgb_tuned2.best_score_))
# build model with best parameters (rebinds xgb_tuned2 to a plain classifier)
xgb_tuned2 = XGBClassifier(
    random_state=1,
    subsample=1, reg_lambda= 5, n_estimators= 100, learning_rate=0.1, gamma=1, eval_metric='logloss'
)
X_train_over, Y_train_over = sm.fit_resample(X_temp, Y_temp)
# Fit model on train data
xgb_tuned2.fit(X_train_over, Y_train_over)
# Calculate metrics on the held-out test set
xgboost_random_val = model_perf_class_sklearn(xgb_tuned2, X_test, Y_test)
print('Test Performance:')
xgboost_random_val
cm_sklearn(xgb_tuned2, X_test, Y_test)
# define model
model = GradientBoostingClassifier(random_state=1)
# choice 1 = random undersampling (see data_balancing)
X_train_b, Y_train_b = data_balancing(X_temp, Y_temp, 1)
# Grid parameters to choose from
parameters = {
    "n_estimators": [100,150,200,250],
    "subsample":[0.8,0.9,1],
    "max_features":[0.7,0.8,0.9,1]
}
# Type of scoring
scorer = metrics.make_scorer(metrics.recall_score)
# Run grid search
# NOTE(review): fit on pre-undersampled data, so the internal CV validation
# folds are resampled too; the pipeline-based search below avoids this
grid_obj = GridSearchCV(model, parameters, scoring=scorer,cv=5)
grid_obj = grid_obj.fit(X_train_b, Y_train_b)
# Set clf to the best combination of parameters
gbc_estimator = grid_obj.best_estimator_
# Fit the best algorithm to the data.
gbc_estimator.fit(X_train_b, Y_train_b)
%%time
# define model
model = GradientBoostingClassifier(random_state=1)
# define class balancing
sm = RandomUnderSampler()
# pipeline: undersampling runs on training folds only during CV
pipeline = Pipeline([('sampling', sm), ('class', model)])
# Parameter grid to pass in GridSearchCV
param_grid={'class__n_estimators': [100,150,200,250],
            'class__subsample': [0.8, 0.9, 1],
            'class__max_features':[0.7,0.8,0.9,1]
            }
# scoring
scorer = metrics.make_scorer(metrics.recall_score)
# call GridSearchCV
grid_cv = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring=scorer, cv=5, n_jobs=-1, verbose=2)
# Fit parameters
grid_cv.fit(X_temp, Y_temp)
print('Best parameters are {} with CV score={}:'.format(grid_cv.best_params_, grid_cv.best_score_))
# build model with best parameters
# (note: despite the name, this is an undersampled training set)
X_train_over, Y_train_over = sm.fit_resample(X_temp, Y_temp)
gbc_tuned1 = GradientBoostingClassifier(
    random_state=1,
    n_estimators=250,
    subsample=0.8,
    max_features=0.8
)
# Fit the model on training data
gbc_tuned1.fit(X_train_over, Y_train_over)
# Calculate metrics on the held-out test set
gbc_grid_val = model_perf_class_sklearn(gbc_tuned1, X_test, Y_test)
print('Test Performance:')
gbc_grid_val
cm_sklearn(gbc_tuned1, X_test, Y_test)
%%time
# define model
model = GradientBoostingClassifier(random_state=1)
# parameter grid to pass in GridSearchCV
param_grid={'class__n_estimators': [100,150,200,250],
'class__subsample': [0.8, 0.9, 1],
'class__max_features':[0.7,0.8,0.9,1]}
# Type of scoring
scorer = metrics.make_scorer(metrics.recall_score)
sm = RandomUnderSampler()
pipeline = Pipeline([('sampling', sm), ('class', model)])
# Call RandomizedSearchCV
gbc_tuned2 = RandomizedSearchCV(estimator=pipeline, param_distributions=param_grid, n_iter=10, scoring=scorer, cv=5, random_state=1, n_jobs= -1)
#Fit parameters in RandomizedSearchCV
gbc_tuned2.fit(X_temp, Y_temp)
print('Best parameters are {} with CV score {}:'.format(gbc_tuned2.best_params_, gbc_tuned2.best_score_))
# build model with best parameters
gbc_tuned2 = XGBClassifier(
random_state=1,subsample=0.8, n_estimators= 250, max_features=0.9
)
X_train_over, Y_train_over = sm.fit_resample(X_temp, Y_temp)
# Fit model on train data
gbc_tuned2.fit(X_train_over, Y_train_over)
# Calulate metrics on validation set
gbc_random_val = model_perf_class_sklearn(gbc_tuned2, X_test, Y_test)
print('Test Performance:')
gbc_random_val
cm_sklearn(gbc_tuned2, X_test, Y_test)
# define model
model = AdaBoostClassifier(random_state=1)
# choice 1 = random undersampling (see data_balancing)
X_train_b, Y_train_b = data_balancing(X_temp, Y_temp, 1)
# Grid parameters to choose from
parameters = {
    # try different max_depth values for the weak learner
    "base_estimator":[DecisionTreeClassifier(max_depth=1),DecisionTreeClassifier(max_depth=2),],
    "n_estimators": np.arange(10,110,10),
    "learning_rate":np.arange(0.1,2,0.1)
}
# Type of scoring
scorer = metrics.make_scorer(metrics.recall_score)
# Run grid search
grid_obj = GridSearchCV(model, parameters, scoring=scorer, cv=5)
grid_obj = grid_obj.fit(X_train_b, Y_train_b)
# Set clf to the best combination of parameters
abc_estimator = grid_obj.best_estimator_
# Fit the best algorithm to the data.
abc_estimator.fit(X_train_b, Y_train_b)
%%time
# define model
model = AdaBoostClassifier(random_state=1)
# define class balancing
sm = RandomUnderSampler()
# pipeline: undersampling runs on training folds only during CV
pipeline = Pipeline([('sampling', sm), ('class', model)])
# Parameter grid to pass in GridSearchCV
param_grid={
    'class__base_estimator':[
        DecisionTreeClassifier(max_depth=1),DecisionTreeClassifier(max_depth=2)
    ],
    'class__n_estimators': np.arange(10,110,10),
    'class__learning_rate':np.arange(0.1,2,0.1)
}
# scoring
scorer = metrics.make_scorer(metrics.recall_score)
# call GridSearchCV
grid_cv = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring=scorer, cv=5, n_jobs=-1, verbose=2)
# Fit parameters
grid_cv.fit(X_temp, Y_temp)
print('Best parameters are {} with CV score={}:'.format(grid_cv.best_params_, grid_cv.best_score_))
# build model with best parameters
# (note: despite the name, this is an undersampled training set)
X_train_over, Y_train_over = sm.fit_resample(X_temp, Y_temp)
abc_tuned1 = AdaBoostClassifier(
    random_state=1,
    base_estimator = DecisionTreeClassifier(max_depth=2),
    n_estimators = 80,
    learning_rate=0.9,
)
# Fit the model on training data
abc_tuned1.fit(X_train_over, Y_train_over)
# Calculate metrics on the held-out test set
abc_grid_val = model_perf_class_sklearn(abc_tuned1, X_test, Y_test)
print('Test Performance:')
abc_grid_val
cm_sklearn(abc_tuned1, X_test, Y_test)
%%time
# define model
model = AdaBoostClassifier(random_state=1)
# parameter grid to pass in GridSearchCV
param_grid={
'class__base_estimator':[
DecisionTreeClassifier(max_depth=1),DecisionTreeClassifier(max_depth=2)
],
'class__n_estimators': np.arange(10,110,10),
'class__learning_rate':np.arange(0.1,2,0.1)
}
# Type of scoring
scorer = metrics.make_scorer(metrics.recall_score)
sm = RandomUnderSampler()
pipeline = Pipeline([('sampling', sm), ('class', model)])
# Call RandomizedSearchCV
abc_tuned2 = RandomizedSearchCV(estimator=pipeline, param_distributions=param_grid, n_iter=10, scoring=scorer, cv=5, random_state=1, n_jobs= -1)
#Fit parameters in RandomizedSearchCV
abc_tuned2.fit(X_temp, Y_temp)
print('Best parameters are {} with CV score {}:'.format(abc_tuned2.best_params_, abc_tuned2.best_score_))
# build model with best parameters
abc_tuned2 = XGBClassifier(
random_state=1, learning_rate=0.6, n_estimators= 50, max_depth=2
)
X_train_over, Y_train_over = sm.fit_resample(X_temp, Y_temp)
# Fit model on train data
abc_tuned2.fit(X_train_over, Y_train_over)
# Calulate metrics on validation set
abc_random_val = model_perf_class_sklearn(abc_tuned2, X_test, Y_test)
print('Test Performance:')
abc_random_val
cm_sklearn(abc_tuned2, X_test, Y_test)
# side-by-side test performance of all six tuned models
models_test_comp_df = pd.concat(
    [xgboost_grid_val.T, xgboost_random_val.T, gbc_grid_val.T, gbc_random_val.T, abc_grid_val.T, abc_random_val.T],
    axis=1
)
models_test_comp_df.columns = [
    'XGBoost GridSearch CV',
    'XGBoost RandomizedSearch CV',
    'GradientBoost GridSearch CV',
    'GradientBoost RandomizedSearch CV',
    'AdaBoost GridSearch CV',
    'AdaBoost RandomizedSearch CV']
print("Testing performance comparison:")
models_test_comp_df
# feature importances of the tuned model, plotted smallest to largest
# NOTE(review): assumes gbc_tuned2 was trained on the same column order as X
feature_names = X.columns
importances = gbc_tuned2.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12,12))
plt.title('Feature Importances')  # fixed typo: was 'Feature Importancers'
plt.barh(range(len(indices)), importances[indices], color='violet', align='center')
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
def myProcessingSteps(df):
    """
    Raw-data preprocessing used as the first step of the final pipeline.

    Replaces the 'abc' placeholder in Income_Category and drops the columns
    unused by the model. Works on a copy so the caller's dataframe is left
    untouched (FunctionTransformer would otherwise mutate its input).
    """
    df = df.copy()
    # Replacing the values in Income_Category.
    # Fixed: the replacement value now matches the EDA step ('Less than $40K');
    # previously this wrote 'Less than 40K' (missing '$'), silently creating a
    # new category inconsistent with the rest of the analysis.
    df['Income_Category'] = df['Income_Category'].replace({'abc': 'Less than $40K'})
    # Dropping the identifier and the two columns removed for multicollinearity
    df = df.drop(columns=['Avg_Open_To_Buy', 'Total_Trans_Ct', 'CLIENTNUM'])
    return df
# wrap the raw-data cleanup so it can sit inside a sklearn pipeline
processing = FunctionTransformer(myProcessingSteps)
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
# creating a list of numerical variables
numerical_features = [
    "Customer_Age",
    "Dependent_count",
    "Months_on_book",
    "Total_Relationship_Count",
    "Months_Inactive_12_mon",
    "Contacts_Count_12_mon",
    "Credit_Limit",
    "Total_Revolving_Bal",
    "Total_Amt_Chng_Q4_Q1",
    "Total_Trans_Amt",
    "Total_Ct_Chng_Q4_Q1",
    "Avg_Utilization_Ratio"
]
# numeric variables: median imputation only (tree models need no scaling)
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))])
# creating a list of categorical variables
categorical_features = [
    "Gender", "Education_Level", "Marital_Status", "Income_Category", "Card_Category"
]
# creating a transformer for categorical variables, which will first apply simple imputer and
# then do one hot encoding for categorical variables
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)
# combining categorical transformer and numerical transformer using a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough'
)
# remainder = "passthrough" has been used, it will allow variables that are present in original data
# but not in "numerical_columns" and "categorical_columns" to pass through the column transformer without any changes
# full end-to-end pipeline: raw cleanup -> impute/encode -> undersample -> GBC
# (imblearn's Pipeline is required so the sampler runs on training data only)
model = Pipeline(
    steps=[
        ('CT', processing),
        ('pre', preprocessor),
        ('class balance', RandomUnderSampler()),
        ('GBC', GradientBoostingClassifier(random_state=1, subsample=0.8, n_estimators=250, max_features=0.9)
         )
    ]
)
# Separating Target variable from the other variables (raw churn frame: the
# pipeline itself performs all preprocessing)
X = churn.drop('Attrition_Flag', axis=1)
# encode the target exactly as in the EDA step (Attrited=1, Existing=0);
# fixed: 'y' was defined but the stale numeric 'Y' built from the earlier,
# already-preprocessed frame was used in the split
y = churn['Attrition_Flag'].replace({'Attrited Customer': 1, 'Existing Customer': 0})
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)
print(X_train.shape, X_test.shape)
# fit the full pipeline on the raw training data
model.fit(X_train, Y_train)
# calculate metrics on the held-out test set
gradientboost_pipeline = model_perf_class_sklearn(model, X_test, Y_test)
print('Test Performance:')
gradientboost_pipeline